# PNAS Analysis Pipeline Runner
# Alex F. Gazmararian
# agazmararian@gmail.com
#
# This script runs the complete PNAS analysis pipeline including both
# statements analysis and visibility analysis components.
#
# Run this script from the project root directory.
#
# Pipeline Overview:
# 1. Statements Analysis: Political statement annotation and analysis
# 2. Visibility Analysis: Survey-based proximity effects and credit attribution

# Make sure renv is active before any package is attached.
if (!isNamespaceLoaded("renv")) {
  # Bootstrap files in priority order: the renv activation script first, the
  # project .Rprofile as a fallback. Only the first one found is sourced.
  # local() keeps the helper variables out of the global environment;
  # source() itself still evaluates the file in the global environment.
  local({
    bootstrap_candidates <- c("renv/activate.R", ".Rprofile")
    available <- bootstrap_candidates[file.exists(bootstrap_candidates)]
    if (length(available) > 0) {
      source(available[[1]])
    }
  })
}

# Load required packages with error handling.
# If attaching fails, attempt a single renv::restore() and attach again. When
# renv itself is not installed there is nothing to restore from, so the run
# is aborted with instructions instead.
tryCatch({
  library(here)
  library(purrr)
}, error = function(e) {
  message("Error loading required packages: ", conditionMessage(e))
  message("Attempting to restore renv environment...")
  # system.file() returns "" for a package that is not installed. It replaces
  # installed.packages(), which scans every installed package and is
  # documented as unsuitable for checking whether one package is available.
  if (nzchar(system.file(package = "renv"))) {
    renv::restore()
    library(here)
    library(purrr)
  } else {
    stop(
      "Required packages are not available. Please ensure renv is properly set up and run renv::restore()",
      call. = FALSE
    )
  }
})

# Configuration ----
message("=== PNAS ANALYSIS PIPELINE RUNNER ===")
message("This pipeline runs both statements and visibility analyses")
message("")

# Wall-clock start of the run; also used to name the log file
start_time <- Sys.time()

# PNAS mode: TRUE (the default) is meant to restrict the run to the outputs
# used in the paper; FALSE additionally enables diagnostic/internal outputs.
# The resolved value is written back to options() so child scripts can read it.
PNAS_MODE <- getOption("pnas.mode", default = TRUE)
options(pnas.mode = PNAS_MODE)

# Skip switches, each read from an R option and defaulting to FALSE.
# NOTE(review): in this script SKIP_RENV_CHECK is only reported in the
# configuration summary; nothing visible here branches on it.
SKIP_STATEMENTS <- getOption("pnas.skip_statements", default = FALSE)
SKIP_VISIBILITY <- getOption("pnas.skip_visibility", default = FALSE)
SKIP_ROBUSTNESS <- getOption("pnas.skip_robustness", default = FALSE)
SKIP_RENV_CHECK <- getOption("pnas.skip_renv_check", default = FALSE)

# Environment Validation ----
message("=== ENVIRONMENT VALIDATION ===")

# Check R version (numeric_version comparison, so "4.10" sorts after "4.9")
r_version <- getRversion()
min_r_version <- "4.3.0"
if (r_version < min_r_version) {
  stop(sprintf("R version %s or higher required. Current version: %s", min_r_version, r_version))
}
message(sprintf("[OK] R version: %s", r_version))

# Check for essential directory structure; anything missing is created
essential_dirs <- c(
  here("data", "input"),
  here("data", "inter"),
  here("data", "output"),
  here("output", "pnas", "figures"),
  here("output", "pnas", "tables"),
  here("output", "pnas", "stats"),
  here("output", "pnas", "log")
)

# dir.exists() is vectorised, so no sapply() wrapper is needed
missing_dirs <- essential_dirs[!dir.exists(essential_dirs)]
if (length(missing_dirs) > 0) {
  message("[INFO] Creating missing directories:")
  for (missing_dir in missing_dirs) {
    dir.create(missing_dir, recursive = TRUE, showWarnings = FALSE)
    # dir.create() reports failure only via its (silenced) warning and return
    # value, so confirm the directory exists before claiming success.
    if (!dir.exists(missing_dir)) {
      stop(sprintf("Could not create required directory: %s", missing_dir),
           call. = FALSE)
    }
    message(sprintf("  [OK] Created: %s", missing_dir))
  }
}
message("[OK] Essential directory structure ready")

# Set up logging ----
# One log file per run, stamped with the pipeline start time
log_dir <- here("output", "pnas", "log")
log_timestamp <- format(start_time, "%Y%m%d_%H%M%S")
pnas_log_file <- file.path(log_dir, sprintf("pnas_pipeline_%s.log", log_timestamp))

# Publish the log path as an option so child scripts can write to the same file
options(pnas.unified_log_file = pnas_log_file)

# Pipeline logger - named pnas_log to avoid conflicts with child scripts.
#
# Two calling conventions are accepted:
#   pnas_log(level, msg)   level is one of "INFO", "WARN", "ERROR", "DEBUG"
#                          and nothing else is passed -> logs "level: msg"
#   pnas_log(format, ...)  sprintf-style formatting; when called with the
#                          first argument only, it is logged verbatim and is
#                          never handed to sprintf()
#
# Every entry is prefixed with a timestamp, emitted via message(), and
# appended to `pnas_log_file` when that file's directory exists. Any error
# raised while formatting or writing is caught and replaced by a console-only
# fallback line that shows the first argument.
pnas_log <- function(type_or_format, msg_or_args = NULL, ...) {
  stamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  known_levels <- c("INFO", "WARN", "ERROR", "DEBUG")

  tryCatch({
    second_given <- !is.null(msg_or_args)
    extra_given <- length(list(...)) > 0

    if (second_given && !extra_given && type_or_format %in% known_levels) {
      # Convention 1: level tag followed by the message text
      entry <- sprintf("%s: %s", type_or_format, msg_or_args)
    } else if (!second_given && !extra_given) {
      # Bare text: logged as-is so literal "%" characters are safe
      entry <- as.character(type_or_format)
    } else {
      # Convention 2: format string plus its arguments
      entry <- do.call(
        sprintf,
        c(list(type_or_format), list(msg_or_args), list(...))
      )
    }

    stamped <- sprintf("[%s] %s", stamp, entry)

    # Console
    message(stamped)

    # Log file (skipped while the log directory does not exist)
    if (file.exists(dirname(pnas_log_file))) {
      cat(sprintf("%s\n", stamped), file = pnas_log_file, append = TRUE)
    }
  }, error = function(e) {
    # Last resort: show at least the raw first argument on the console
    message(sprintf("[%s] Logging error, printing directly: %s",
                    stamp, as.character(type_or_format)))
  })
}

# Run one pipeline component script and summarise the outcome.
#
# Arguments:
#   script_path     Path of the R script to source.
#   component_name  Short identifier used in log lines and in the result.
#   description     Human-readable description, logged when the run starts.
#
# Returns a list:
#   success:  success = TRUE, warnings (character vector), elapsed_mins
#   failure:  error = TRUE, message, component, plus warnings and
#             elapsed_mins (the latter two are absent when the script file
#             does not exist)
#
# Errors raised by the script are caught and reported in the result; they do
# not propagate to the caller.
run_pipeline_component <- function(script_path, component_name, description) {
  component_label <- toupper(component_name)

  pnas_log("=== STARTING %s ===", component_label)
  pnas_log("Description: %s", description)
  pnas_log("Script: %s", script_path)
  pnas_log("")

  start_component_time <- Sys.time()

  # Check if file exists
  if (!file.exists(script_path)) {
    error_msg <- sprintf("Script not found: %s", script_path)
    pnas_log("[ERROR] %s", error_msg)
    return(list(error = TRUE, message = error_msg, component = component_name))
  }

  warnings_captured <- character(0)

  # Source the script into its own environment (a child of the global
  # environment). With local = TRUE the script would be evaluated in this
  # function's frame, where a top-level assignment to e.g. `component_name`
  # or `warnings_captured` would silently corrupt the runner's bookkeeping.
  script_env <- new.env(parent = globalenv())

  # The error is caught INSIDE capture.output() so that the messages emitted
  # before a failure are kept and replayed below, not lost on unwind.
  run_error <- NULL
  output_captured <- capture.output({
    run_error <- tryCatch(
      withCallingHandlers(
        {
          source(script_path, local = script_env)
          NULL
        },
        warning = function(w) {
          # Record and log the warning, then let the script continue
          warning_msg <- conditionMessage(w)
          warnings_captured <<- c(warnings_captured, warning_msg)
          pnas_log("[WARN] WARNING in %s: %s", component_name, warning_msg)
          invokeRestart("muffleWarning")
        }
      ),
      error = function(e) e
    )
  }, type = "message")

  elapsed <- as.numeric(
    difftime(Sys.time(), start_component_time, units = "mins")
  )

  # Log captured output to main pipeline log (on success and on failure)
  if (length(output_captured) > 0) {
    pnas_log("=== OUTPUT FROM %s ===", component_label)
    for (line in output_captured) {
      pnas_log("%s", line)
    }
    pnas_log("=== END OUTPUT FROM %s ===", component_label)
  }

  # Failure path
  if (!is.null(run_error)) {
    error_msg <- conditionMessage(run_error)
    pnas_log("[ERROR] in %s after %.1f minutes: %s",
             component_label, elapsed, error_msg)
    pnas_log("Script path: %s", script_path)
    pnas_log("")

    return(list(error = TRUE, message = error_msg, component = component_name,
                warnings = warnings_captured, elapsed_mins = elapsed))
  }

  # Report completion
  if (length(warnings_captured) > 0) {
    pnas_log("[OK] COMPLETED %s (%.1f minutes, %d warnings)",
             component_label, elapsed, length(warnings_captured))
  } else {
    pnas_log("[OK] COMPLETED %s (%.1f minutes)",
             component_label, elapsed)
  }
  pnas_log("")

  list(success = TRUE, warnings = warnings_captured, elapsed_mins = elapsed)
}

# Pipeline Execution ----
pnas_log("Starting PNAS analysis pipeline...")
pnas_log("Timestamp: %s", format(start_time, "%Y-%m-%d %H:%M:%S"))
pnas_log("Log file: %s", pnas_log_file)
pnas_log("")

# Display configuration: one YES/NO line per switch
pnas_log("=== PIPELINE CONFIGURATION ===")
local({
  flag_report <- list(
    list("PNAS mode (paper outputs only): %s", PNAS_MODE),
    list("Skip statements analysis: %s", SKIP_STATEMENTS),
    list("Skip visibility analysis: %s", SKIP_VISIBILITY),
    list("Skip robustness checks: %s", SKIP_ROBUSTNESS),
    list("Skip renv check: %s", SKIP_RENV_CHECK)
  )
  for (entry in flag_report) {
    pnas_log(entry[[1]], ifelse(entry[[2]], "YES", "NO"))
  }
})
pnas_log("")

# Nothing to run: explain how to enable an analysis, then abort
if (SKIP_STATEMENTS && SKIP_VISIBILITY) {
  invisible(lapply(
    c(
      "[WARN] Both analyses are set to skip. Nothing to do.",
      "To run analyses, set options:",
      "  options(pnas.skip_statements = FALSE)",
      "  options(pnas.skip_visibility = FALSE)"
    ),
    pnas_log
  ))
  stop("No analyses configured to run")
}

# Per-component outcomes, keyed by "statements" / "visibility"
results <- list()

# Component 1: Statements Analysis ----
if (SKIP_STATEMENTS) {
  pnas_log("=== SKIPPING STATEMENTS ANALYSIS ===")
  pnas_log("Statements analysis skipped per configuration")
  pnas_log("")
  results[["statements"]] <- list(skipped = TRUE)
} else {
  # Forward the robustness switch to the statements analysis via options()
  if (SKIP_ROBUSTNESS) {
    options(statements.skip_robustness = TRUE)
  }

  results[["statements"]] <- run_pipeline_component(
    script_path = here("analysis", "statements", "run_statements_analysis.R"),
    component_name = "statements_analysis",
    description = "Political statement annotation and credit attribution analysis"
  )

  # A failure is logged here; the visibility analysis below is still attempted
  if (isTRUE(results[["statements"]]$error)) {
    pnas_log("[WARN] Statements analysis failed.")
  }
}

# Component 2: Visibility Analysis ----
if (SKIP_VISIBILITY) {
  pnas_log("=== SKIPPING VISIBILITY ANALYSIS ===")
  pnas_log("Visibility analysis skipped per configuration")
  pnas_log("")
  results[["visibility"]] <- list(skipped = TRUE)
} else {
  results[["visibility"]] <- run_pipeline_component(
    script_path = here("analysis", "visibility", "run_visibility_analysis.R"),
    component_name = "visibility_analysis",
    description = "Survey-based proximity effects and political credit attribution analysis"
  )
}

# Pipeline Summary ----
end_time <- Sys.time()
total_elapsed <- as.numeric(difftime(end_time, start_time, units = "mins"))

pnas_log("=== PNAS PIPELINE COMPLETE ===")
pnas_log("Total runtime: %.1f minutes", total_elapsed)
pnas_log("Start time: %s", format(start_time, "%Y-%m-%d %H:%M:%S"))
pnas_log("End time: %s", format(end_time, "%Y-%m-%d %H:%M:%S"))
pnas_log("")

# Component-wise summary: one status line per main component
pnas_log("=== COMPONENT SUMMARY ===")
# Restrict the report to the components this runner defines explicitly
main_components <- c("statements", "visibility")
for (component_name in main_components) {
  component_label <- toupper(component_name)

  # Guard: component never recorded
  if (!(component_name %in% names(results))) {
    pnas_log("%s: NOT RUN", component_label)
    next
  }

  outcome <- results[[component_name]]

  # Guard: anything other than a result list cannot be interpreted
  if (!is.list(outcome)) {
    pnas_log("%s: UNEXPECTED RESULT TYPE (not a list)", component_label)
    next
  }

  if (isTRUE(outcome$skipped)) {
    pnas_log("%s: SKIPPED", component_label)
  } else if (isTRUE(outcome$error)) {
    # elapsed_mins is absent when the script file was never found
    pnas_log("%s: FAILED (%.1f minutes) - %s",
             component_label, outcome$elapsed_mins %||% 0, outcome$message)
  } else if (isTRUE(outcome$success)) {
    n_warnings <- length(outcome$warnings)
    warning_suffix <- if (n_warnings > 0) {
      sprintf(" with %d warnings", n_warnings)
    } else {
      ""
    }
    pnas_log("%s: SUCCESS (%.1f minutes)%s",
             component_label, outcome$elapsed_mins, warning_suffix)
  }
}
pnas_log("")

# Error and warning summary - restricted to the main components
main_results <- results[names(results) %in% main_components]

# Components whose result list is flagged as failed
errors <- purrr::keep(
  main_results,
  function(res) is.list(res) && isTRUE(res$error)
)

# Every warning message collected across the main components
all_warnings <- unlist(lapply(main_results, function(res) {
  if (!is.list(res)) {
    return(character(0))
  }
  res$warnings %||% character(0)
}))

if (length(errors) > 0) {
  pnas_log("[ERROR] PIPELINE COMPLETED WITH ERRORS:")
  for (failed_name in names(errors)) {
    pnas_log("  %s: %s", toupper(failed_name), errors[[failed_name]]$message)
  }
  pnas_log("")
  pnas_log("Check the log file for detailed error information: %s", pnas_log_file)
} else if (length(all_warnings) > 0) {
  pnas_log("[WARN] PIPELINE COMPLETED WITH WARNINGS:")
  pnas_log("Total warnings across all components: %d", length(all_warnings))
  pnas_log("Check the log file for detailed warning information: %s", pnas_log_file)
} else {
  pnas_log("[OK] PIPELINE COMPLETED SUCCESSFULLY!")
  pnas_log("All components ran without errors or warnings")
}

pnas_log("")

# Output summary: list output locations for each component that actually ran
# and did not fail
pnas_log("=== OUTPUT SUMMARY ===")
pnas_log("Key outputs generated:")

local({
  output_specs <- list(
    list(
      key = "statements",
      title = "Statements Analysis:",
      data = here("data", "output", "statements_analysis.csv")
    ),
    list(
      key = "visibility",
      title = "Visibility Analysis:",
      data = here("data", "output", "visibility_analysis.rds")
    )
  )

  for (spec in output_specs) {
    outcome <- results[[spec$key]]
    ran_ok <- is.list(outcome) &&
      !isTRUE(outcome$error) &&
      !isTRUE(outcome$skipped)
    if (ran_ok) {
      pnas_log(spec$title)
      pnas_log("  - Tables: %s", here("output", "pnas", "tables"))
      pnas_log("  - Figures: %s", here("output", "pnas", "figures"))
      pnas_log("  - Data: %s", spec$data)
    }
  }
})

pnas_log("Logs: %s", pnas_log_file)
pnas_log("")

# Reproducibility information: record the R build, platform and session
# settings alongside the results
pnas_log("=== REPRODUCIBILITY INFORMATION ===")
pnas_log("R Version: %s", R.version.string)
pnas_log("Platform: %s", R.version$platform)
pnas_log("System: %s %s", Sys.info()[["sysname"]], Sys.info()[["release"]])
pnas_log("Working Directory: %s", getwd())
pnas_log("Locale: %s", Sys.getlocale("LC_CTYPE"))
pnas_log("Timezone: %s", Sys.timezone())

# renv status: log an MD5 fingerprint of the lockfile when renv is installed.
# system.file() returns "" for a missing package and avoids the full library
# scan that installed.packages() performs.
if (nzchar(system.file(package = "renv"))) {
  tryCatch({
    if (file.exists("renv.lock")) {
      lockfile_hash <- tools::md5sum("renv.lock")
      pnas_log("renv.lock hash: %s", lockfile_hash)
    }
  }, error = function(e) {
    pnas_log("renv status: error checking - %s", conditionMessage(e))
  })
}

pnas_log("")
pnas_log("=== PIPELINE EXECUTION COMPLETE ===")
# Only announce readiness when no component failed; after a failure the
# outputs are incomplete and this line would be misleading in the log.
if (length(errors) == 0) {
  pnas_log("Analysis ready for manuscript preparation!")
}

# Final exit status
if (length(errors) > 0) {
  message("\n[ERROR] Pipeline completed with errors. Check log file for details.")
  if (interactive()) {
    # quit() would close the user's R session (and discard their workspace);
    # raise an error instead so the failure is still unmistakable.
    stop("PNAS pipeline completed with errors", call. = FALSE)
  }
  # Non-interactive runs (e.g. Rscript) signal failure through the exit code
  quit(status = 1)
} else if (length(all_warnings) > 0) {
  message("\n[WARN] Pipeline completed with warnings. Check log file for details.")
} else {
  message("\n[OK] Pipeline completed successfully!")
}